In [914]:
!pip install arch
Requirement already satisfied: arch in /usr/local/lib/python3.7/dist-packages (5.1.0)
Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.7/dist-packages (from arch) (1.19.5)
Requirement already satisfied: property-cached>=1.6.4 in /usr/local/lib/python3.7/dist-packages (from arch) (1.6.4)
Requirement already satisfied: statsmodels>=0.11 in /usr/local/lib/python3.7/dist-packages (from arch) (0.13.1)
Requirement already satisfied: scipy>=1.3 in /usr/local/lib/python3.7/dist-packages (from arch) (1.4.1)
Requirement already satisfied: pandas>=1.0 in /usr/local/lib/python3.7/dist-packages (from arch) (1.1.5)
Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.7/dist-packages (from pandas>=1.0->arch) (2.8.2)
Requirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.7/dist-packages (from pandas>=1.0->arch) (2018.9)
Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/dist-packages (from python-dateutil>=2.7.3->pandas>=1.0->arch) (1.15.0)
Requirement already satisfied: patsy>=0.5.2 in /usr/local/lib/python3.7/dist-packages (from statsmodels>=0.11->arch) (0.5.2)
In [915]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import date
from scipy import stats

%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

from datetime import datetime, timedelta
from sklearn.metrics import mean_squared_error as mse
from sklearn.preprocessing import MinMaxScaler
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

import ipywidgets as widgets
from IPython.display import display
In [1370]:
# Dropdown widget for choosing which stock's CSV to analyse.
# 'SELECT' is a sentinel meaning "nothing chosen yet".
STOCK_OPTIONS = ['SELECT', 'AAPL', 'ABUS', 'ARDS', 'BABA', 'BFRI',
                 'FB', 'GME', 'MCD', 'PFE', 'PLUG',
                 'QCOM', 'SENS', 'TSLA', 'TWTR', 'UUUU']

w = widgets.Dropdown(
    options=STOCK_OPTIONS,
    value='SELECT',
    description='Stock name:',
)

def on_change(change):
    """Echo the newly selected ticker whenever the dropdown value changes."""
    if change['type'] == 'change' and change['name'] == 'value':
        print("You have selected %s" % change['new'])

# names='value' makes ipywidgets invoke the handler only for value changes,
# instead of for every trait change; the handler's own check remains as a
# harmless second guard.
w.observe(on_change, names='value')

display(w)
You have selected FB
In [1371]:
# Load the CSV for the ticker chosen in the dropdown.
# Every data file follows the /content/Final_<TICKER>.csv naming convention,
# so a single formatted path replaces the previous 15-way duplicated if-chain.
# If nothing was selected, fail loudly here instead of leaving `df` undefined
# and triggering a confusing NameError in a later cell.
if w.value == 'SELECT':
    raise ValueError("Please choose a stock from the dropdown before running this cell.")
df = pd.read_csv('/content/Final_%s.csv' % w.value)
In [1372]:
# Show full (untruncated) cell contents when displaying DataFrames.
pd.options.display.max_colwidth = None
In [1373]:
# Parse the Date column into proper datetimes. pd.to_datetime is the
# idiomatic route and is more forgiving than an astype cast when the CSV
# contains slightly irregular date strings.
df['Date'] = pd.to_datetime(df['Date'])
In [1374]:
# Drop the stray index column produced by an earlier to_csv(index=True).
# errors='ignore' makes this cell idempotent: re-running it (or loading a
# file without the column) no longer raises a KeyError the way `del` did.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')
In [1375]:
# Visual sanity check of the first five rows of the loaded data.
df.head(5)
Out[1375]:
Date Open High Low Close Adj Close Volume Return Beta Variance AvgTrueRange Upperband Lowerband Middleband APO NATR TRANGE DMI MACD MACDSIGNAL MACDHIST MOM PPO ROCP RSI TRIX ULTOSC SLOWK SLOWD AD ADOSC OBV Upward_momentum_created Downward_momentum_created B5_O_Um B5_C_Um B5_E_Um B5_A_Um B5_N_Um B5_O_Dm B5_C_Dm B5_E_Dm B5_A_Dm B5_N_Dm Verified_status_True Verified_status_False O C E A N Real_or_Fake_tweet
0 2020-10-19 265.529999 268.549988 259.880005 261.399994 261.399994 13587000 -1.703455 0.516515 28.213060 7.412785 279.510339 258.263953 268.887146 NaN 2.835802 8.669983 8.814872 NaN NaN NaN -3.250000 NaN -0.012280 50.539201 NaN NaN 15.620213 35.071195 2.473284e+06 -6.455652e+06 79553900.0 1.302967e+07 8.793275e+06 1.302967e+07 0.0 0.0 1.302967e+07 1.302967e+07 8.793275e+06 0.0 0.0 8.793275e+06 8.793275e+06 13 864 877 0 0 877 877 877
1 2020-10-20 263.059998 269.700012 262.880005 267.559998 267.559998 18763200 2.356543 0.525569 25.454696 7.539533 279.421959 259.240900 269.331430 NaN 2.817885 8.300018 4.000441 NaN NaN NaN 8.899994 NaN 0.034408 55.849293 NaN NaN 20.765477 23.150741 9.461270e+06 -5.632550e+06 98317100.0 1.536367e+07 1.036841e+07 1.536367e+07 0.0 0.0 1.536367e+07 1.536367e+07 1.036841e+07 0.0 0.0 1.036841e+07 1.036841e+07 16 1237 1253 0 0 1253 1253 1253
2 2020-10-21 279.559998 283.049988 276.369995 278.730011 278.730011 28998600 4.174770 0.966566 32.007074 8.675312 281.072104 258.442187 269.757145 NaN 3.112443 15.489990 34.967497 NaN NaN NaN 20.610016 NaN 0.079847 63.501306 NaN NaN 43.321051 26.568914 9.528592e+05 -7.490998e+06 127315700.0 2.273140e+07 1.534063e+07 2.273140e+07 0.0 0.0 2.273140e+07 2.273140e+07 1.534063e+07 0.0 0.0 1.534063e+07 1.534063e+07 19 1581 1600 0 0 1600 1600 1600
3 2020-10-22 279.869995 282.450012 275.040009 278.119995 278.119995 16720000 -0.218855 1.005030 36.097958 8.494554 282.056315 258.023685 270.040000 6.637436 3.054277 7.410004 29.334988 NaN NaN NaN 14.359985 2.529470 0.054443 62.860557 NaN NaN 66.925761 43.670763 -1.867721e+06 -8.467729e+06 110595700.0 1.406206e+07 9.489997e+06 1.406206e+07 0.0 0.0 1.406206e+07 1.406206e+07 9.489997e+06 0.0 0.0 9.489997e+06 9.489997e+06 19 1310 1329 0 0 1329 1329 1329
4 2020-10-23 278.799988 285.239990 276.820007 284.790009 284.790009 17535200 2.398250 0.994783 63.292640 8.483901 287.804191 255.981523 271.892857 7.707245 2.979002 8.419983 35.883502 NaN NaN NaN 20.339996 2.924319 0.076914 66.804699 NaN NaN 86.101137 65.449317 1.379324e+07 -3.114474e+06 128130900.0 2.643801e+07 1.784210e+07 2.643801e+07 0.0 0.0 2.643801e+07 2.643801e+07 1.784210e+07 0.0 0.0 1.784210e+07 1.784210e+07 12 1416 1428 0 0 1428 1428 1428
In [1378]:
# Dtypes and non-null counts — note the warm-up NaNs in APO, MACD, TRIX,
# ULTOSC and PPO (rolling technical indicators need a lead-in window).
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 295 entries, 0 to 294
Data columns (total 52 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   Date                       295 non-null    datetime64[ns]
 1   Open                       295 non-null    float64       
 2   High                       295 non-null    float64       
 3   Low                        295 non-null    float64       
 4   Close                      295 non-null    float64       
 5   Adj Close                  295 non-null    float64       
 6   Volume                     295 non-null    int64         
 7   Return                     295 non-null    float64       
 8   Beta                       295 non-null    float64       
 9   Variance                   295 non-null    float64       
 10  AvgTrueRange               295 non-null    float64       
 11  Upperband                  295 non-null    float64       
 12  Lowerband                  295 non-null    float64       
 13  Middleband                 295 non-null    float64       
 14  APO                        292 non-null    float64       
 15  NATR                       295 non-null    float64       
 16  TRANGE                     295 non-null    float64       
 17  DMI                        295 non-null    float64       
 18  MACD                       284 non-null    float64       
 19  MACDSIGNAL                 284 non-null    float64       
 20  MACDHIST                   284 non-null    float64       
 21  MOM                        295 non-null    float64       
 22  PPO                        292 non-null    float64       
 23  ROCP                       295 non-null    float64       
 24  RSI                        295 non-null    float64       
 25  TRIX                       229 non-null    float64       
 26  ULTOSC                     289 non-null    float64       
 27  SLOWK                      295 non-null    float64       
 28  SLOWD                      295 non-null    float64       
 29  AD                         295 non-null    float64       
 30  ADOSC                      295 non-null    float64       
 31  OBV                        295 non-null    float64       
 32  Upward_momentum_created    295 non-null    float64       
 33  Downward_momentum_created  295 non-null    float64       
 34  B5_O_Um                    295 non-null    float64       
 35  B5_C_Um                    295 non-null    float64       
 36  B5_E_Um                    295 non-null    float64       
 37  B5_A_Um                    295 non-null    float64       
 38  B5_N_Um                    295 non-null    float64       
 39  B5_O_Dm                    295 non-null    float64       
 40  B5_C_Dm                    295 non-null    float64       
 41  B5_E_Dm                    295 non-null    float64       
 42  B5_A_Dm                    295 non-null    float64       
 43  B5_N_Dm                    295 non-null    float64       
 44  Verified_status_True       295 non-null    int64         
 45  Verified_status_False      295 non-null    int64         
 46  O                          295 non-null    int64         
 47  C                          295 non-null    int64         
 48  E                          295 non-null    int64         
 49  A                          295 non-null    int64         
 50  N                          295 non-null    int64         
 51  Real_or_Fake_tweet         295 non-null    int64         
dtypes: datetime64[ns](1), float64(42), int64(9)
memory usage: 120.0 KB
In [1379]:
# (rows, columns) of the raw frame before cleaning.
df.shape
Out[1379]:
(295, 52)
In [1380]:
# Seaborn theme defaults; the font scale is overridden by set_context below.
sns.set(font_scale=0.8)
In [1381]:
# CHANGE CONTEXT TO poster TO INCREASE FONT SIZES
sns.set_context("talk", font_scale=1.3)

# PLOT THE SELECTED STOCK'S DAILY CLOSING PRICES
# (previous comment said "BTC-USD since 2014" — this is the equity chosen
#  via the dropdown, starting 2020-10-19 per df.head() above)
with sns.axes_style("darkgrid"):
    fig, ax = plt.subplots(figsize=(18,8))
    sns.lineplot(x=df.Date, y=df.Close, color='blue')
    ax.set_title('Closing Price')    
In [1382]:
# CALCULATE PRICE RETURNS AS DAILY PERCENTAGE CHANGE USING pct_change()
# The original chained .dropna() before the assignment — a misleading no-op:
# column assignment aligns on the index, so the first row's NaN came straight
# back. Removed so the code says what actually happens.
df['returns'] = 100 * df.Close.pct_change()
In [1383]:
# LOG RETURNS: natural log of today's close over the previous close.
# First row is NaN because there is no prior close to compare against.
prev_close = df.Close.shift(1)
df['log_returns'] = np.log(df.Close / prev_close)
In [1384]:
# Confirm the two new return columns (first row is NaN by construction).
df.head()
Out[1384]:
Date Open High Low Close Adj Close Volume Return Beta Variance AvgTrueRange Upperband Lowerband Middleband APO NATR TRANGE DMI MACD MACDSIGNAL MACDHIST MOM PPO ROCP RSI TRIX ULTOSC SLOWK SLOWD AD ADOSC OBV Upward_momentum_created Downward_momentum_created B5_O_Um B5_C_Um B5_E_Um B5_A_Um B5_N_Um B5_O_Dm B5_C_Dm B5_E_Dm B5_A_Dm B5_N_Dm Verified_status_True Verified_status_False O C E A N Real_or_Fake_tweet returns log_returns
0 2020-10-19 265.529999 268.549988 259.880005 261.399994 261.399994 13587000 -1.703455 0.516515 28.213060 7.412785 279.510339 258.263953 268.887146 NaN 2.835802 8.669983 8.814872 NaN NaN NaN -3.250000 NaN -0.012280 50.539201 NaN NaN 15.620213 35.071195 2.473284e+06 -6.455652e+06 79553900.0 1.302967e+07 8.793275e+06 1.302967e+07 0.0 0.0 1.302967e+07 1.302967e+07 8.793275e+06 0.0 0.0 8.793275e+06 8.793275e+06 13 864 877 0 0 877 877 877 NaN NaN
1 2020-10-20 263.059998 269.700012 262.880005 267.559998 267.559998 18763200 2.356543 0.525569 25.454696 7.539533 279.421959 259.240900 269.331430 NaN 2.817885 8.300018 4.000441 NaN NaN NaN 8.899994 NaN 0.034408 55.849293 NaN NaN 20.765477 23.150741 9.461270e+06 -5.632550e+06 98317100.0 1.536367e+07 1.036841e+07 1.536367e+07 0.0 0.0 1.536367e+07 1.536367e+07 1.036841e+07 0.0 0.0 1.036841e+07 1.036841e+07 16 1237 1253 0 0 1253 1253 1253 2.356543 0.023292
2 2020-10-21 279.559998 283.049988 276.369995 278.730011 278.730011 28998600 4.174770 0.966566 32.007074 8.675312 281.072104 258.442187 269.757145 NaN 3.112443 15.489990 34.967497 NaN NaN NaN 20.610016 NaN 0.079847 63.501306 NaN NaN 43.321051 26.568914 9.528592e+05 -7.490998e+06 127315700.0 2.273140e+07 1.534063e+07 2.273140e+07 0.0 0.0 2.273140e+07 2.273140e+07 1.534063e+07 0.0 0.0 1.534063e+07 1.534063e+07 19 1581 1600 0 0 1600 1600 1600 4.174770 0.040900
3 2020-10-22 279.869995 282.450012 275.040009 278.119995 278.119995 16720000 -0.218855 1.005030 36.097958 8.494554 282.056315 258.023685 270.040000 6.637436 3.054277 7.410004 29.334988 NaN NaN NaN 14.359985 2.529470 0.054443 62.860557 NaN NaN 66.925761 43.670763 -1.867721e+06 -8.467729e+06 110595700.0 1.406206e+07 9.489997e+06 1.406206e+07 0.0 0.0 1.406206e+07 1.406206e+07 9.489997e+06 0.0 0.0 9.489997e+06 9.489997e+06 19 1310 1329 0 0 1329 1329 1329 -0.218855 -0.002191
4 2020-10-23 278.799988 285.239990 276.820007 284.790009 284.790009 17535200 2.398250 0.994783 63.292640 8.483901 287.804191 255.981523 271.892857 7.707245 2.979002 8.419983 35.883502 NaN NaN NaN 20.339996 2.924319 0.076914 66.804699 NaN NaN 86.101137 65.449317 1.379324e+07 -3.114474e+06 128130900.0 2.643801e+07 1.784210e+07 2.643801e+07 0.0 0.0 2.643801e+07 2.643801e+07 1.784210e+07 0.0 0.0 1.784210e+07 1.784210e+07 12 1416 1428 0 0 1428 1428 1428 2.398250 0.023699
In [1385]:
# DROP EVERY ROW CONTAINING A NaN.
# NOTE(review): the original comment claimed only the first (shifted) row
# is dropped, but indicator columns (TRIX, MACD, ULTOSC, APO, PPO) have
# longer warm-up NaN runs — the subsequent df.info() shows 229 of 295 rows
# survive, starting at index 66.
df.dropna(inplace=True)
In [1386]:
# PLOT DISTRIBUTION PLOTS OF RETURNS & LOG RETURNS
# AND VISUALLY COMPARE THEM WITH THE STANDARD NORMAL DISTRIBUTION
# Left column: time series; right column: histogram with a fitted normal
# (fit=stats.norm overlays the best-fit Gaussian for visual comparison).
# NOTE(review): sns.distplot is deprecated in seaborn >= 0.11 — migrate to
# histplot(..., stat='density') plus an explicit fitted curve when upgrading.
with sns.axes_style("darkgrid"):
    fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(18,12))

    axes[0][0].plot(df.returns, color='blue')
    axes[0][0].set_title('Returns')

    sns.distplot(df.returns, norm_hist=True, fit=stats.norm, color='blue',
                bins=50, ax=axes[0][1])
    axes[0][1].set_title('Returns')

    axes[1][0].plot(df.log_returns, color='green')
    axes[1][0].set_title('Log Returns')

    sns.distplot(df.log_returns, norm_hist=True, fit=stats.norm, color='green',
                bins=50, ax=axes[1][1])
    axes[1][1].set_title('Log Returns')
    plt.tight_layout()
    fig.show();
In [1387]:
# CREATE A FUNCTION THAT CALCULATES REALIZED VOLATILITY
# FROM DAILY LOG RETURNS
def realized_volatility_daily(series_log_return):
    """
    Daily realized volatility: the square root of the (n-1)-normalised
    sum of squared log returns over the given window.

    Parameters
    ----------
    series_log_return : array-like of float
        Log returns inside one rolling window.

    Returns
    -------
    float
        Realized volatility, or NaN when fewer than two observations are
        present (the original divided by zero for a length-1 window).
    """
    n = len(series_log_return)
    if n < 2:
        # Guard: (n - 1) == 0 would divide by zero for degenerate windows.
        return np.nan
    return np.sqrt(np.sum(series_log_return**2)/(n - 1))
In [1388]:
# Window lengths (in trading days) to compare.
intervals = [7, 30, 60, 180, 365]

# Realized volatility per window length, built as {window: ndarray} via a
# dict comprehension, then assembled into one DataFrame aligned to df's index.
vols_by_window = {
    window: df.log_returns.rolling(window=window)
                          .apply(realized_volatility_daily)
                          .values
    for window in intervals
}

vols_df = pd.DataFrame(vols_by_window, columns=intervals, index=df.index)
In [1389]:
# SWITCH MATPLOTLIB TO THE fivethirtyeight STYLE
plt.style.use(['fivethirtyeight'])

fig, ax = plt.subplots(figsize=(18,7))

for window in intervals:
    # De-emphasise the noisy 7-day series; highlight the longer windows.
    is_weekly = window == 7
    ax.plot(vols_df[window],
            label=f'{window}-Day Interval Realized Volatility',
            alpha=0.5 if is_weekly else 1.0,
            lw=1 if is_weekly else 2)

ax.set_title('Realized Volatility Using Different Interval Windows', fontsize=21)

plt.legend(loc='best', prop={'size': 14})
plt.show();
In [1390]:
# Rolling window length (days) for realized volatility, and how many days
# ahead the forecast target looks.
INTERVAL_WINDOW = 30
n_future = 7

# GET BACKWARD LOOKING REALIZED VOLATILITY
# (volatility over the trailing 30 days, fully known at time t — a feature)
df['vol_current'] = df.log_returns.rolling(window=INTERVAL_WINDOW)\
                                   .apply(realized_volatility_daily)

# GET FORWARD LOOKING REALIZED VOLATILITY
# shift(-n_future) pulls future returns back 7 rows, so each row's value is
# the 30-day realized volatility of a window ending 7 days ahead — a target,
# not a feature; it must never leak into model inputs.
df['vol_future'] = df.log_returns.shift(-n_future)\
                                 .rolling(window=INTERVAL_WINDOW)\
                                 .apply(realized_volatility_daily)
In [1391]:
# Summary statistics. vol_current/vol_future show lower counts (200/193)
# because of rolling-window and forward-shift NaNs at the series edges.
df.describe()
Out[1391]:
Open High Low Close Adj Close Volume Return Beta Variance AvgTrueRange Upperband Lowerband Middleband APO NATR TRANGE DMI MACD MACDSIGNAL MACDHIST MOM PPO ROCP RSI TRIX ULTOSC SLOWK SLOWD AD ADOSC OBV Upward_momentum_created Downward_momentum_created B5_O_Um B5_C_Um B5_E_Um B5_A_Um B5_N_Um B5_O_Dm B5_C_Dm B5_E_Dm B5_A_Dm B5_N_Dm Verified_status_True Verified_status_False O C E A N Real_or_Fake_tweet returns log_returns vol_current vol_future
count 229.000000 229.000000 229.000000 229.000000 229.000000 2.290000e+02 229.000000 229.000000 229.000000 229.000000 229.000000 229.000000 229.000000 229.000000 229.000000 229.000000 229.000000 229.000000 229.000000 229.000000 229.000000 229.000000 229.000000 229.000000 229.000000 229.000000 229.000000 229.000000 2.290000e+02 2.290000e+02 2.290000e+02 2.290000e+02 2.290000e+02 2.290000e+02 229.0 229.0 2.290000e+02 2.290000e+02 2.290000e+02 229.0 229.0 2.290000e+02 2.290000e+02 229.000000 229.000000 229.000000 229.0 229.0 229.000000 229.000000 229.000000 229.000000 229.000000 200.000000 193.000000
mean 324.028471 327.934497 320.328558 324.106769 324.106769 1.882184e+07 0.102095 0.707394 37.561476 8.241042 334.292867 312.135393 323.214130 1.894761 2.582427 8.307686 30.431005 1.919762 1.876277 0.043485 3.097380 0.642281 0.011773 53.337122 0.095213 50.616765 53.635262 53.742087 2.648997e+07 -1.210092e+06 1.215471e+08 1.647925e+07 1.112127e+07 1.647925e+07 0.0 0.0 1.647925e+07 1.647925e+07 1.112127e+07 0.0 0.0 1.112127e+07 1.112127e+07 23.222707 1007.358079 1030.580786 0.0 0.0 1030.580786 1030.580786 1030.580786 0.102095 0.000854 0.018271 0.018216
std 32.997093 32.703978 33.067166 33.132561 33.132561 8.513970e+06 1.830289 0.387364 35.584550 1.483687 33.613741 33.572936 33.179582 6.869757 0.594363 3.891468 21.144859 4.985253 4.700879 1.585270 15.426772 2.123343 0.048434 9.684182 0.151454 7.761885 24.063109 22.162850 4.168374e+07 8.527498e+06 5.669391e+07 2.568582e+07 1.733447e+07 2.568582e+07 0.0 0.0 2.568582e+07 2.568582e+07 1.733447e+07 0.0 0.0 1.733447e+07 1.733447e+07 29.492018 698.387160 725.900535 0.0 0.0 725.900535 725.900535 725.900535 1.830289 0.018287 0.003003 0.003042
min 256.470001 260.989990 253.500000 254.690002 254.690002 7.170700e+06 -5.051486 -0.056048 1.422975 5.910309 266.451929 241.728177 259.078574 -17.979233 1.621675 2.839996 0.024934 -10.432474 -9.295748 -4.351803 -35.880005 -5.021933 -0.095291 28.067550 -0.152559 29.853721 5.376144 8.780267 -7.038640e+07 -3.009619e+07 -3.779310e+07 3.385735e+06 2.284915e+06 3.385735e+06 0.0 0.0 3.385735e+06 3.385735e+06 2.284915e+06 0.0 0.0 2.284915e+06 2.284915e+06 2.000000 475.000000 480.000000 0.0 0.0 480.000000 480.000000 480.000000 -5.051486 -0.051835 0.010485 0.010485
25% 304.279999 308.029999 301.109985 305.260010 305.260010 1.356620e+07 -1.027929 0.477996 11.916671 6.949356 315.007276 295.786478 304.705715 -1.734679 2.058120 5.649994 12.539127 -1.046619 -1.334955 -0.906405 -6.679993 -0.555130 -0.021627 48.709038 -0.050297 44.759407 31.767166 34.863283 -7.389696e+06 -6.502618e+06 8.999400e+07 6.174700e+06 4.167091e+06 6.174700e+06 0.0 0.0 6.174700e+06 6.174700e+06 4.167091e+06 0.0 0.0 4.167091e+06 4.167091e+06 11.000000 666.000000 679.000000 0.0 0.0 679.000000 679.000000 679.000000 -1.027929 -0.010332 0.016077 0.015847
50% 330.149994 333.450012 326.640015 329.820007 329.820007 1.654710e+07 0.005309 0.706678 28.115329 8.077672 339.223727 320.531531 329.592861 3.156665 2.559623 7.399994 27.260878 3.025517 3.761067 0.233736 5.059998 0.962986 0.016028 55.161008 0.162236 50.075088 55.257362 53.702117 2.813018e+07 5.394893e+03 1.321155e+08 8.559627e+06 5.776596e+06 8.559627e+06 0.0 0.0 8.559627e+06 8.559627e+06 5.776596e+06 0.0 0.0 5.776596e+06 5.776596e+06 15.000000 804.000000 820.000000 0.0 0.0 820.000000 820.000000 820.000000 0.005309 0.000053 0.019216 0.018518
75% 345.730011 351.540009 342.369995 346.230011 346.230011 2.192460e+07 1.380014 0.939990 48.194022 9.459607 357.263115 335.440268 347.635707 5.851986 2.968448 9.840012 45.009003 5.876812 5.552583 1.168752 14.619995 1.804830 0.045355 60.328975 0.244970 56.461953 74.625917 73.076926 6.039061e+07 5.144554e+06 1.665046e+08 1.276212e+07 8.612716e+06 1.276212e+07 0.0 0.0 1.276212e+07 1.276212e+07 8.612716e+06 0.0 0.0 8.612716e+06 8.612716e+06 22.000000 1033.000000 1043.000000 0.0 0.0 1043.000000 1043.000000 1043.000000 1.380014 0.013706 0.020200 0.020201
max 381.679993 384.329987 378.809998 382.179993 382.179993 6.565400e+07 7.297298 2.058409 183.244044 11.724620 387.069852 374.143195 379.054286 13.693526 4.269934 24.709991 80.290577 10.929499 9.458732 3.273955 38.230011 4.750806 0.149740 72.738611 0.268682 69.292585 92.449478 92.345203 1.004125e+08 1.618555e+07 2.200628e+08 1.688531e+08 1.139531e+08 1.688531e+08 0.0 0.0 1.688531e+08 1.688531e+08 1.139531e+08 0.0 0.0 1.139531e+08 1.139531e+08 228.000000 6040.000000 6221.000000 0.0 0.0 6221.000000 6221.000000 6221.000000 7.297298 0.070433 0.023392 0.023392
In [1392]:
# Clearer name for the target label column; rebinding avoids inplace mutation.
df = df.rename(columns={'Real_or_Fake_tweet': 'Fake_news'})
In [1393]:
# Fill remaining NaNs (the rolling-window edges of vol_current/vol_future)
# with each column's median. numeric_only=True keeps the datetime Date
# column out of the median computation — newer pandas versions reject
# reducing non-numeric columns instead of silently skipping them.
df = df.fillna(df.median(numeric_only=True))
In [1394]:
# Verify no NaNs remain after the median fill.
df.isna().sum()
Out[1394]:
Date                         0
Open                         0
High                         0
Low                          0
Close                        0
Adj Close                    0
Volume                       0
Return                       0
Beta                         0
Variance                     0
AvgTrueRange                 0
Upperband                    0
Lowerband                    0
Middleband                   0
APO                          0
NATR                         0
TRANGE                       0
DMI                          0
MACD                         0
MACDSIGNAL                   0
MACDHIST                     0
MOM                          0
PPO                          0
ROCP                         0
RSI                          0
TRIX                         0
ULTOSC                       0
SLOWK                        0
SLOWD                        0
AD                           0
ADOSC                        0
OBV                          0
Upward_momentum_created      0
Downward_momentum_created    0
B5_O_Um                      0
B5_C_Um                      0
B5_E_Um                      0
B5_A_Um                      0
B5_N_Um                      0
B5_O_Dm                      0
B5_C_Dm                      0
B5_E_Dm                      0
B5_A_Dm                      0
B5_N_Dm                      0
Verified_status_True         0
Verified_status_False        0
O                            0
C                            0
E                            0
A                            0
N                            0
Fake_news                    0
returns                      0
log_returns                  0
vol_current                  0
vol_future                   0
dtype: int64
In [1395]:
# Re-check dtypes and row count after cleaning (229 rows, indices 66-294).
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 229 entries, 66 to 294
Data columns (total 56 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   Date                       229 non-null    datetime64[ns]
 1   Open                       229 non-null    float64       
 2   High                       229 non-null    float64       
 3   Low                        229 non-null    float64       
 4   Close                      229 non-null    float64       
 5   Adj Close                  229 non-null    float64       
 6   Volume                     229 non-null    int64         
 7   Return                     229 non-null    float64       
 8   Beta                       229 non-null    float64       
 9   Variance                   229 non-null    float64       
 10  AvgTrueRange               229 non-null    float64       
 11  Upperband                  229 non-null    float64       
 12  Lowerband                  229 non-null    float64       
 13  Middleband                 229 non-null    float64       
 14  APO                        229 non-null    float64       
 15  NATR                       229 non-null    float64       
 16  TRANGE                     229 non-null    float64       
 17  DMI                        229 non-null    float64       
 18  MACD                       229 non-null    float64       
 19  MACDSIGNAL                 229 non-null    float64       
 20  MACDHIST                   229 non-null    float64       
 21  MOM                        229 non-null    float64       
 22  PPO                        229 non-null    float64       
 23  ROCP                       229 non-null    float64       
 24  RSI                        229 non-null    float64       
 25  TRIX                       229 non-null    float64       
 26  ULTOSC                     229 non-null    float64       
 27  SLOWK                      229 non-null    float64       
 28  SLOWD                      229 non-null    float64       
 29  AD                         229 non-null    float64       
 30  ADOSC                      229 non-null    float64       
 31  OBV                        229 non-null    float64       
 32  Upward_momentum_created    229 non-null    float64       
 33  Downward_momentum_created  229 non-null    float64       
 34  B5_O_Um                    229 non-null    float64       
 35  B5_C_Um                    229 non-null    float64       
 36  B5_E_Um                    229 non-null    float64       
 37  B5_A_Um                    229 non-null    float64       
 38  B5_N_Um                    229 non-null    float64       
 39  B5_O_Dm                    229 non-null    float64       
 40  B5_C_Dm                    229 non-null    float64       
 41  B5_E_Dm                    229 non-null    float64       
 42  B5_A_Dm                    229 non-null    float64       
 43  B5_N_Dm                    229 non-null    float64       
 44  Verified_status_True       229 non-null    int64         
 45  Verified_status_False      229 non-null    int64         
 46  O                          229 non-null    int64         
 47  C                          229 non-null    int64         
 48  E                          229 non-null    int64         
 49  A                          229 non-null    int64         
 50  N                          229 non-null    int64         
 51  Fake_news                  229 non-null    int64         
 52  returns                    229 non-null    float64       
 53  log_returns                229 non-null    float64       
 54  vol_current                229 non-null    float64       
 55  vol_future                 229 non-null    float64       
dtypes: datetime64[ns](1), float64(46), int64(9)
memory usage: 102.0 KB
In [1396]:
# Shape after cleaning: 229 rows, 56 columns.
df.shape
Out[1396]:
(229, 56)
In [1397]:
# Safeguard: drop any rows still containing NaN. This is a no-op after the
# median fill above (isna().sum() showed all zeros).
df=df.dropna()
In [1398]:
# Final dtype audit before correlation analysis.
df.dtypes
Out[1398]:
Date                         datetime64[ns]
Open                                float64
High                                float64
Low                                 float64
Close                               float64
Adj Close                           float64
Volume                                int64
Return                              float64
Beta                                float64
Variance                            float64
AvgTrueRange                        float64
Upperband                           float64
Lowerband                           float64
Middleband                          float64
APO                                 float64
NATR                                float64
TRANGE                              float64
DMI                                 float64
MACD                                float64
MACDSIGNAL                          float64
MACDHIST                            float64
MOM                                 float64
PPO                                 float64
ROCP                                float64
RSI                                 float64
TRIX                                float64
ULTOSC                              float64
SLOWK                               float64
SLOWD                               float64
AD                                  float64
ADOSC                               float64
OBV                                 float64
Upward_momentum_created             float64
Downward_momentum_created           float64
B5_O_Um                             float64
B5_C_Um                             float64
B5_E_Um                             float64
B5_A_Um                             float64
B5_N_Um                             float64
B5_O_Dm                             float64
B5_C_Dm                             float64
B5_E_Dm                             float64
B5_A_Dm                             float64
B5_N_Dm                             float64
Verified_status_True                  int64
Verified_status_False                 int64
O                                     int64
C                                     int64
E                                     int64
A                                     int64
N                                     int64
Fake_news                             int64
returns                             float64
log_returns                         float64
vol_current                         float64
vol_future                          float64
dtype: object
In [1399]:
# Correlation heatmap across all numeric columns.
# (redundant mid-notebook re-imports of matplotlib/seaborn removed — both
#  are already imported in the setup cell at the top; the trailing ';'
#  suppresses the bare AxesSubplot repr in the output)
plt.figure(figsize=(40,15))
sns.heatmap(df.corr(), annot=True);
Out[1399]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f077b8da690>
In [1400]:
# Histogram of every numeric column to eyeball the distributions.
df.hist(bins=70, xlabelsize=8, ylabelsize=8, figsize=(20, 32));
In [1401]:
# Report features whose absolute correlation with AvgTrueRange exceeds 0.5.
# NOTE(review): this report cell is copy-pasted many times below — a small
# helper function would remove the duplication.
df_corr = df.corr()['AvgTrueRange']
strong_mask = df_corr.abs() > 0.5
golden_features_list = df_corr.loc[strong_mask].sort_values(ascending=False)
print("There are {} strongly correlated values with AvgTrueRange:\n{}".format(
    len(golden_features_list), golden_features_list))
There are 10 strongly correlated values with AvgTrueRange:
AvgTrueRange    1.000000
NATR            0.884366
Volume          0.582479
TRANGE          0.551637
Variance        0.543889
vol_future      0.535772
vol_current     0.515088
ADOSC          -0.520882
MACDSIGNAL     -0.540871
TRIX           -0.665833
Name: AvgTrueRange, dtype: float64
In [1402]:
# Report features whose absolute correlation with NATR exceeds 0.5.
df_corr = df.corr()['NATR']
strong_mask = df_corr.abs() > 0.5
golden_features_list = df_corr.loc[strong_mask].sort_values(ascending=False)
print("There are {} strongly correlated values with NATR :\n{}".format(
    len(golden_features_list), golden_features_list))
There are 19 strongly correlated values with NATR :
NATR            1.000000
AvgTrueRange    0.884366
vol_future      0.639271
vol_current     0.629907
Volume          0.548559
MACDSIGNAL     -0.515302
ADOSC          -0.522255
ULTOSC         -0.533084
Upperband      -0.551986
OBV            -0.574327
AD             -0.595478
High           -0.627074
Middleband     -0.630286
Open           -0.646912
Adj Close      -0.656804
Close          -0.656804
Low            -0.668555
Lowerband      -0.693145
TRIX           -0.729206
Name: NATR, dtype: float64
In [1403]:
# Report features whose absolute correlation with TRANGE exceeds 0.5.
df_corr = df.corr()['TRANGE']
strong_mask = df_corr.abs() > 0.5
golden_features_list = df_corr.loc[strong_mask].sort_values(ascending=False)
print("There are {} strongly correlated values with TRANGE:\n{}".format(
    len(golden_features_list), golden_features_list))
There are 8 strongly correlated values with TRANGE:
TRANGE                   1.000000
Volume                   0.750758
AvgTrueRange             0.551637
Verified_status_False    0.547878
Fake_news                0.544462
N                        0.544462
A                        0.544462
O                        0.544462
Name: TRANGE, dtype: float64
In [1404]:
# Report features whose absolute correlation with Openness ('O') exceeds 0.5.
df_corr = df.corr()['O']
strong_mask = df_corr.abs() > 0.5
golden_features_list = df_corr.loc[strong_mask].sort_values(ascending=False)
print("There are {} strongly correlated values with Openness:\n{}".format(
    len(golden_features_list), golden_features_list))
There are 16 strongly correlated values with Openness:
Fake_news                    1.000000
N                            1.000000
A                            1.000000
O                            1.000000
Verified_status_False        0.999889
Verified_status_True         0.935544
Volume                       0.767866
B5_N_Dm                      0.604301
B5_A_Dm                      0.604301
B5_O_Dm                      0.604301
Downward_momentum_created    0.604301
B5_N_Um                      0.604301
B5_A_Um                      0.604301
B5_O_Um                      0.604301
Upward_momentum_created      0.604301
TRANGE                       0.544462
Name: O, dtype: float64
In [1405]:
# Report features whose absolute correlation with Conscientiousness ('C')
# exceeds 0.5.
df_corr = df.corr()['C']
strong_mask = df_corr.abs() > 0.5
golden_features_list = df_corr.loc[strong_mask].sort_values(ascending=False)
print("There are {} strongly correlated values with conscientiousness:\n{}".format(
    len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with conscientiousness:
Series([], Name: C, dtype: float64)
In [1406]:
# Report features whose absolute correlation with Extraversion ('E') exceeds 0.5.
# Fix: the printed label said "conscientiousness" (copy-pasted from the 'C'
# cell) even though this cell analyses column 'E' (Extraversion).
df_corr = df.corr()['E']
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Extraversion:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with conscientiousness:
Series([], Name: E, dtype: float64)
In [1407]:
# Report features whose absolute correlation with Agreeableness ('A') exceeds 0.5.
# Fix: the printed label said "conscientiousness" (copy-pasted from the 'C'
# cell) even though this cell analyses column 'A' (Agreeableness).
df_corr = df.corr()['A']
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Agreeableness:\n{}".format(len(golden_features_list), golden_features_list))
There are 16 strongly correlated values with conscientiousness:
Fake_news                    1.000000
N                            1.000000
A                            1.000000
O                            1.000000
Verified_status_False        0.999889
Verified_status_True         0.935544
Volume                       0.767866
B5_N_Dm                      0.604301
B5_A_Dm                      0.604301
B5_O_Dm                      0.604301
Downward_momentum_created    0.604301
B5_N_Um                      0.604301
B5_A_Um                      0.604301
B5_O_Um                      0.604301
Upward_momentum_created      0.604301
TRANGE                       0.544462
Name: A, dtype: float64
In [1408]:
# Report features whose absolute correlation with Neuroticism ('N') exceeds 0.5.
# Fix: the printed label said "conscientiousness" (copy-pasted from the 'C'
# cell) even though this cell analyses column 'N' (Neuroticism).
df_corr = df.corr()['N']
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Neuroticism:\n{}".format(len(golden_features_list), golden_features_list))
There are 16 strongly correlated values with conscientiousness:
Fake_news                    1.000000
N                            1.000000
A                            1.000000
O                            1.000000
Verified_status_False        0.999889
Verified_status_True         0.935544
Volume                       0.767866
B5_N_Dm                      0.604301
B5_A_Dm                      0.604301
B5_O_Dm                      0.604301
Downward_momentum_created    0.604301
B5_N_Um                      0.604301
B5_A_Um                      0.604301
B5_O_Um                      0.604301
Upward_momentum_created      0.604301
TRANGE                       0.544462
Name: N, dtype: float64
In [1409]:
# List every engineered feature column currently in the frame.
df.columns
Out[1409]:
Index(['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume', 'Return',
       'Beta', 'Variance', 'AvgTrueRange', 'Upperband', 'Lowerband',
       'Middleband', 'APO', 'NATR', 'TRANGE', 'DMI', 'MACD', 'MACDSIGNAL',
       'MACDHIST', 'MOM', 'PPO', 'ROCP', 'RSI', 'TRIX', 'ULTOSC', 'SLOWK',
       'SLOWD', 'AD', 'ADOSC', 'OBV', 'Upward_momentum_created',
       'Downward_momentum_created', 'B5_O_Um', 'B5_C_Um', 'B5_E_Um', 'B5_A_Um',
       'B5_N_Um', 'B5_O_Dm', 'B5_C_Dm', 'B5_E_Dm', 'B5_A_Dm', 'B5_N_Dm',
       'Verified_status_True', 'Verified_status_False', 'O', 'C', 'E', 'A',
       'N', 'Fake_news', 'returns', 'log_returns', 'vol_current',
       'vol_future'],
      dtype='object')
In [1410]:
# Report features whose absolute correlation with B5_O_Um exceeds 0.5.
df_corr = df.corr()['B5_O_Um']
strong_mask = df_corr.abs() > 0.5
golden_features_list = df_corr.loc[strong_mask].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_O_Um:\n{}".format(
    len(golden_features_list), golden_features_list))
There are 15 strongly correlated values with B5_O_Um:
B5_N_Dm                      1.000000
B5_A_Dm                      1.000000
B5_O_Dm                      1.000000
Downward_momentum_created    1.000000
B5_N_Um                      1.000000
B5_A_Um                      1.000000
B5_O_Um                      1.000000
Upward_momentum_created      1.000000
Verified_status_True         0.667880
Fake_news                    0.604301
N                            0.604301
A                            0.604301
O                            0.604301
Verified_status_False        0.599904
Volume                       0.523574
Name: B5_O_Um, dtype: float64
In [1411]:
# Report features whose absolute correlation with B5_C_Um exceeds 0.5.
df_corr = df.corr()['B5_C_Um']
strong_mask = df_corr.abs() > 0.5
golden_features_list = df_corr.loc[strong_mask].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_C_Um:\n{}".format(
    len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with B5_C_Um:
Series([], Name: B5_C_Um, dtype: float64)
In [1412]:
# Report features whose absolute correlation with B5_E_Um exceeds 0.5.
df_corr = df.corr()['B5_E_Um']
strong_mask = df_corr.abs() > 0.5
golden_features_list = df_corr.loc[strong_mask].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_E_Um:\n{}".format(
    len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with B5_E_Um:
Series([], Name: B5_E_Um, dtype: float64)
In [1413]:
# Report features whose absolute correlation with B5_A_Um exceeds 0.5.
df_corr = df.corr()['B5_A_Um']
strong_mask = df_corr.abs() > 0.5
golden_features_list = df_corr.loc[strong_mask].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_A_Um:\n{}".format(
    len(golden_features_list), golden_features_list))
There are 15 strongly correlated values with B5_A_Um:
B5_N_Dm                      1.000000
B5_A_Dm                      1.000000
B5_O_Dm                      1.000000
Downward_momentum_created    1.000000
B5_N_Um                      1.000000
B5_A_Um                      1.000000
B5_O_Um                      1.000000
Upward_momentum_created      1.000000
Verified_status_True         0.667880
Fake_news                    0.604301
N                            0.604301
A                            0.604301
O                            0.604301
Verified_status_False        0.599904
Volume                       0.523574
Name: B5_A_Um, dtype: float64
In [1414]:
# Report features whose absolute correlation with B5_N_Um exceeds 0.5.
df_corr = df.corr()['B5_N_Um']
strong_mask = df_corr.abs() > 0.5
golden_features_list = df_corr.loc[strong_mask].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_N_Um:\n{}".format(
    len(golden_features_list), golden_features_list))
There are 15 strongly correlated values with B5_N_Um:
B5_N_Dm                      1.000000
B5_A_Dm                      1.000000
B5_O_Dm                      1.000000
Downward_momentum_created    1.000000
B5_N_Um                      1.000000
B5_A_Um                      1.000000
B5_O_Um                      1.000000
Upward_momentum_created      1.000000
Verified_status_True         0.667880
Fake_news                    0.604301
N                            0.604301
A                            0.604301
O                            0.604301
Verified_status_False        0.599904
Volume                       0.523574
Name: B5_N_Um, dtype: float64

Downward momentum correlation

In [1415]:
# Report features whose absolute correlation with B5_O_Dm exceeds 0.5.
df_corr = df.corr()['B5_O_Dm']
strong_mask = df_corr.abs() > 0.5
golden_features_list = df_corr.loc[strong_mask].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_O_Dm:\n{}".format(
    len(golden_features_list), golden_features_list))
There are 15 strongly correlated values with B5_O_Dm:
B5_N_Um                      1.000000
B5_A_Um                      1.000000
B5_O_Um                      1.000000
Upward_momentum_created      1.000000
B5_N_Dm                      1.000000
B5_A_Dm                      1.000000
B5_O_Dm                      1.000000
Downward_momentum_created    1.000000
Verified_status_True         0.667880
Fake_news                    0.604301
N                            0.604301
A                            0.604301
O                            0.604301
Verified_status_False        0.599904
Volume                       0.523574
Name: B5_O_Dm, dtype: float64
In [1416]:
# Report features whose absolute correlation with B5_C_Dm exceeds 0.5.
df_corr = df.corr()['B5_C_Dm']
strong_mask = df_corr.abs() > 0.5
golden_features_list = df_corr.loc[strong_mask].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_C_Dm:\n{}".format(
    len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with B5_C_Dm:
Series([], Name: B5_C_Dm, dtype: float64)
In [1417]:
# Report features whose absolute correlation with B5_E_Dm exceeds 0.5.
df_corr = df.corr()['B5_E_Dm']
strong_mask = df_corr.abs() > 0.5
golden_features_list = df_corr.loc[strong_mask].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_E_Dm:\n{}".format(
    len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with B5_E_Dm:
Series([], Name: B5_E_Dm, dtype: float64)
In [1418]:
# Report features whose absolute correlation with B5_A_Dm exceeds 0.5.
df_corr = df.corr()['B5_A_Dm']
strong_mask = df_corr.abs() > 0.5
golden_features_list = df_corr.loc[strong_mask].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_A_Dm:\n{}".format(
    len(golden_features_list), golden_features_list))
There are 15 strongly correlated values with B5_A_Dm:
B5_N_Um                      1.000000
B5_A_Um                      1.000000
B5_O_Um                      1.000000
Upward_momentum_created      1.000000
B5_N_Dm                      1.000000
B5_A_Dm                      1.000000
B5_O_Dm                      1.000000
Downward_momentum_created    1.000000
Verified_status_True         0.667880
Fake_news                    0.604301
N                            0.604301
A                            0.604301
O                            0.604301
Verified_status_False        0.599904
Volume                       0.523574
Name: B5_A_Dm, dtype: float64
In [1419]:
# Report features whose absolute correlation with B5_N_Dm exceeds 0.5.
df_corr = df.corr()['B5_N_Dm']
strong_mask = df_corr.abs() > 0.5
golden_features_list = df_corr.loc[strong_mask].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_N_Dm:\n{}".format(
    len(golden_features_list), golden_features_list))
There are 15 strongly correlated values with B5_N_Dm:
B5_N_Um                      1.000000
B5_A_Um                      1.000000
B5_O_Um                      1.000000
Upward_momentum_created      1.000000
B5_N_Dm                      1.000000
B5_A_Dm                      1.000000
B5_O_Dm                      1.000000
Downward_momentum_created    1.000000
Verified_status_True         0.667880
Fake_news                    0.604301
N                            0.604301
A                            0.604301
O                            0.604301
Verified_status_False        0.599904
Volume                       0.523574
Name: B5_N_Dm, dtype: float64
In [1420]:
# Report features whose absolute correlation with Fake_news exceeds 0.5.
# NOTE(review): the printed label 'Real_or_Fake_tweet' differs from the column
# name 'Fake_news' — presumably an intentional alias; confirm.
df_corr = df.corr()['Fake_news']
strong_mask = df_corr.abs() > 0.5
golden_features_list = df_corr.loc[strong_mask].sort_values(ascending=False)
print("There are {} strongly correlated values with Real_or_Fake_tweet :\n{}".format(
    len(golden_features_list), golden_features_list))
There are 16 strongly correlated values with Real_or_Fake_tweet :
Fake_news                    1.000000
N                            1.000000
A                            1.000000
O                            1.000000
Verified_status_False        0.999889
Verified_status_True         0.935544
Volume                       0.767866
B5_N_Dm                      0.604301
B5_A_Dm                      0.604301
B5_O_Dm                      0.604301
Downward_momentum_created    0.604301
B5_N_Um                      0.604301
B5_A_Um                      0.604301
B5_O_Um                      0.604301
Upward_momentum_created      0.604301
TRANGE                       0.544462
Name: Fake_news, dtype: float64
In [1421]:
# Report features whose absolute correlation with Downward_momentum_created
# exceeds 0.5.
df_corr = df.corr()['Downward_momentum_created']
strong_mask = df_corr.abs() > 0.5
golden_features_list = df_corr.loc[strong_mask].sort_values(ascending=False)
print("There are {} strongly correlated values with Downward_momentum_created :\n{}".format(
    len(golden_features_list), golden_features_list))
There are 15 strongly correlated values with Downward_momentum_created :
B5_N_Um                      1.000000
B5_A_Um                      1.000000
B5_O_Um                      1.000000
Upward_momentum_created      1.000000
B5_N_Dm                      1.000000
B5_A_Dm                      1.000000
B5_O_Dm                      1.000000
Downward_momentum_created    1.000000
Verified_status_True         0.667880
Fake_news                    0.604301
N                            0.604301
A                            0.604301
O                            0.604301
Verified_status_False        0.599904
Volume                       0.523574
Name: Downward_momentum_created, dtype: float64
In [1422]:
# Report features whose absolute correlation with Upward_momentum_created
# exceeds 0.5.
df_corr = df.corr()['Upward_momentum_created']
strong_mask = df_corr.abs() > 0.5
golden_features_list = df_corr.loc[strong_mask].sort_values(ascending=False)
print("There are {} strongly correlated values with Upward_momentum_created :\n{}".format(
    len(golden_features_list), golden_features_list))
There are 15 strongly correlated values with Upward_momentum_created :
B5_N_Dm                      1.000000
B5_A_Dm                      1.000000
B5_O_Dm                      1.000000
Downward_momentum_created    1.000000
B5_N_Um                      1.000000
B5_A_Um                      1.000000
B5_O_Um                      1.000000
Upward_momentum_created      1.000000
Verified_status_True         0.667880
Fake_news                    0.604301
N                            0.604301
A                            0.604301
O                            0.604301
Verified_status_False        0.599904
Volume                       0.523574
Name: Upward_momentum_created, dtype: float64
In [1423]:
# Report features whose absolute correlation with Verified_status_True
# exceeds 0.5.
df_corr = df.corr()['Verified_status_True']
strong_mask = df_corr.abs() > 0.5
golden_features_list = df_corr.loc[strong_mask].sort_values(ascending=False)
print("There are {} strongly correlated values with Verified_status_True :\n{}".format(
    len(golden_features_list), golden_features_list))
There are 15 strongly correlated values with Verified_status_True :
Verified_status_True         1.000000
Fake_news                    0.935544
N                            0.935544
A                            0.935544
O                            0.935544
Verified_status_False        0.930171
Volume                       0.675078
B5_N_Um                      0.667880
B5_A_Um                      0.667880
B5_O_Um                      0.667880
Upward_momentum_created      0.667880
B5_N_Dm                      0.667880
B5_A_Dm                      0.667880
B5_O_Dm                      0.667880
Downward_momentum_created    0.667880
Name: Verified_status_True, dtype: float64
In [1424]:
# Report features whose absolute correlation with Verified_status_False
# exceeds 0.5.
df_corr = df.corr()['Verified_status_False']
strong_mask = df_corr.abs() > 0.5
golden_features_list = df_corr.loc[strong_mask].sort_values(ascending=False)
print("There are {} strongly correlated values with Verified_status_False :\n{}".format(
    len(golden_features_list), golden_features_list))
There are 16 strongly correlated values with Verified_status_False :
Verified_status_False        1.000000
Fake_news                    0.999889
N                            0.999889
A                            0.999889
O                            0.999889
Verified_status_True         0.930171
Volume                       0.769608
B5_N_Um                      0.599904
B5_A_Um                      0.599904
B5_O_Um                      0.599904
Upward_momentum_created      0.599904
B5_N_Dm                      0.599904
B5_A_Dm                      0.599904
B5_O_Dm                      0.599904
Downward_momentum_created    0.599904
TRANGE                       0.547878
Name: Verified_status_False, dtype: float64
In [1425]:
# Shrink seaborn fonts so the dense pairplot panels below stay legible.
sns.set(font_scale=0.8)
In [1426]:
# Scatter NATR against every feature, five x-variables per figure.
for start in range(0, len(df.columns), 5):
    chunk = df.columns[start:start + 5]
    sns.pairplot(data=df, x_vars=chunk, y_vars=['NATR'])
In [1427]:
# Overview of each column's dtype.
df.dtypes
Out[1427]:
Date                         datetime64[ns]
Open                                float64
High                                float64
Low                                 float64
Close                               float64
Adj Close                           float64
Volume                                int64
Return                              float64
Beta                                float64
Variance                            float64
AvgTrueRange                        float64
Upperband                           float64
Lowerband                           float64
Middleband                          float64
APO                                 float64
NATR                                float64
TRANGE                              float64
DMI                                 float64
MACD                                float64
MACDSIGNAL                          float64
MACDHIST                            float64
MOM                                 float64
PPO                                 float64
ROCP                                float64
RSI                                 float64
TRIX                                float64
ULTOSC                              float64
SLOWK                               float64
SLOWD                               float64
AD                                  float64
ADOSC                               float64
OBV                                 float64
Upward_momentum_created             float64
Downward_momentum_created           float64
B5_O_Um                             float64
B5_C_Um                             float64
B5_E_Um                             float64
B5_A_Um                             float64
B5_N_Um                             float64
B5_O_Dm                             float64
B5_C_Dm                             float64
B5_E_Dm                             float64
B5_A_Dm                             float64
B5_N_Dm                             float64
Verified_status_True                  int64
Verified_status_False                 int64
O                                     int64
C                                     int64
E                                     int64
A                                     int64
N                                     int64
Fake_news                             int64
returns                             float64
log_returns                         float64
vol_current                         float64
vol_future                          float64
dtype: object
In [1428]:
# Count missing values per column (output below shows zero everywhere).
df.isnull().sum()
Out[1428]:
Date                         0
Open                         0
High                         0
Low                          0
Close                        0
Adj Close                    0
Volume                       0
Return                       0
Beta                         0
Variance                     0
AvgTrueRange                 0
Upperband                    0
Lowerband                    0
Middleband                   0
APO                          0
NATR                         0
TRANGE                       0
DMI                          0
MACD                         0
MACDSIGNAL                   0
MACDHIST                     0
MOM                          0
PPO                          0
ROCP                         0
RSI                          0
TRIX                         0
ULTOSC                       0
SLOWK                        0
SLOWD                        0
AD                           0
ADOSC                        0
OBV                          0
Upward_momentum_created      0
Downward_momentum_created    0
B5_O_Um                      0
B5_C_Um                      0
B5_E_Um                      0
B5_A_Um                      0
B5_N_Um                      0
B5_O_Dm                      0
B5_C_Dm                      0
B5_E_Dm                      0
B5_A_Dm                      0
B5_N_Dm                      0
Verified_status_True         0
Verified_status_False        0
O                            0
C                            0
E                            0
A                            0
N                            0
Fake_news                    0
returns                      0
log_returns                  0
vol_current                  0
vol_future                   0
dtype: int64
In [1429]:
# Replace any remaining NaNs with 0. The isnull() check above reported zero
# missing values, so this is a defensive no-op. Rebinding instead of
# inplace=True keeps the cell idempotent and chainable.
df = df.fillna(0)
In [1430]:
# Drop rows with NaNs (defensive; the isnull() check above showed none).
# Rebinding instead of inplace=True avoids the hidden-state anti-pattern.
df = df.dropna()
In [1431]:
# Re-apply the smaller font scale for the masked heatmap below.
sns.set(font_scale=0.8)
In [1432]:
# Masked correlation heatmap: only cells with |corr| above the thresholds are
# shown. 'Close' is excluded from the matrix.
# NOTE(review): thresholds are asymmetric (>= 0.5 vs <= -0.4) — confirm intentional.
corr = df.drop('Close', axis=1).corr()
plt.figure(figsize=(12, 10))

strong = (corr >= 0.5) | (corr <= -0.4)
sns.heatmap(corr[strong],
            cmap='YlGnBu', vmax=1.0, vmin=-1.0, linewidths=0.1,
            annot=True, annot_kws={"size": 8}, square=True);
In [1433]:
# Summary statistics for every numeric column.
df.describe()
Out[1433]:
Open High Low Close Adj Close Volume Return Beta Variance AvgTrueRange Upperband Lowerband Middleband APO NATR TRANGE DMI MACD MACDSIGNAL MACDHIST MOM PPO ROCP RSI TRIX ULTOSC SLOWK SLOWD AD ADOSC OBV Upward_momentum_created Downward_momentum_created B5_O_Um B5_C_Um B5_E_Um B5_A_Um B5_N_Um B5_O_Dm B5_C_Dm B5_E_Dm B5_A_Dm B5_N_Dm Verified_status_True Verified_status_False O C E A N Fake_news returns log_returns vol_current vol_future
count 229.000000 229.000000 229.000000 229.000000 229.000000 2.290000e+02 229.000000 229.000000 229.000000 229.000000 229.000000 229.000000 229.000000 229.000000 229.000000 229.000000 229.000000 229.000000 229.000000 229.000000 229.000000 229.000000 229.000000 229.000000 229.000000 229.000000 229.000000 229.000000 2.290000e+02 2.290000e+02 2.290000e+02 2.290000e+02 2.290000e+02 2.290000e+02 229.0 229.0 2.290000e+02 2.290000e+02 2.290000e+02 229.0 229.0 2.290000e+02 2.290000e+02 229.000000 229.000000 229.000000 229.0 229.0 229.000000 229.000000 229.000000 229.000000 229.000000 229.000000 229.000000
mean 324.028471 327.934497 320.328558 324.106769 324.106769 1.882184e+07 0.102095 0.707394 37.561476 8.241042 334.292867 312.135393 323.214130 1.894761 2.582427 8.307686 30.431005 1.919762 1.876277 0.043485 3.097380 0.642281 0.011773 53.337122 0.095213 50.616765 53.635262 53.742087 2.648997e+07 -1.210092e+06 1.215471e+08 1.647925e+07 1.112127e+07 1.647925e+07 0.0 0.0 1.647925e+07 1.647925e+07 1.112127e+07 0.0 0.0 1.112127e+07 1.112127e+07 23.222707 1007.358079 1030.580786 0.0 0.0 1030.580786 1030.580786 1030.580786 0.102095 0.000854 0.018391 0.018263
std 32.997093 32.703978 33.067166 33.132561 33.132561 8.513970e+06 1.830289 0.387364 35.584550 1.483687 33.613741 33.572936 33.179582 6.869757 0.594363 3.891468 21.144859 4.985253 4.700879 1.585270 15.426772 2.123343 0.048434 9.684182 0.151454 7.761885 24.063109 22.162850 4.168374e+07 8.527498e+06 5.669391e+07 2.568582e+07 1.733447e+07 2.568582e+07 0.0 0.0 2.568582e+07 2.568582e+07 1.733447e+07 0.0 0.0 1.733447e+07 1.733447e+07 29.492018 698.387160 725.900535 0.0 0.0 725.900535 725.900535 725.900535 1.830289 0.018287 0.002823 0.002793
min 256.470001 260.989990 253.500000 254.690002 254.690002 7.170700e+06 -5.051486 -0.056048 1.422975 5.910309 266.451929 241.728177 259.078574 -17.979233 1.621675 2.839996 0.024934 -10.432474 -9.295748 -4.351803 -35.880005 -5.021933 -0.095291 28.067550 -0.152559 29.853721 5.376144 8.780267 -7.038640e+07 -3.009619e+07 -3.779310e+07 3.385735e+06 2.284915e+06 3.385735e+06 0.0 0.0 3.385735e+06 3.385735e+06 2.284915e+06 0.0 0.0 2.284915e+06 2.284915e+06 2.000000 475.000000 480.000000 0.0 0.0 480.000000 480.000000 480.000000 -5.051486 -0.051835 0.010485 0.010485
25% 304.279999 308.029999 301.109985 305.260010 305.260010 1.356620e+07 -1.027929 0.477996 11.916671 6.949356 315.007276 295.786478 304.705715 -1.734679 2.058120 5.649994 12.539127 -1.046619 -1.334955 -0.906405 -6.679993 -0.555130 -0.021627 48.709038 -0.050297 44.759407 31.767166 34.863283 -7.389696e+06 -6.502618e+06 8.999400e+07 6.174700e+06 4.167091e+06 6.174700e+06 0.0 0.0 6.174700e+06 6.174700e+06 4.167091e+06 0.0 0.0 4.167091e+06 4.167091e+06 11.000000 666.000000 679.000000 0.0 0.0 679.000000 679.000000 679.000000 -1.027929 -0.010332 0.016257 0.016257
50% 330.149994 333.450012 326.640015 329.820007 329.820007 1.654710e+07 0.005309 0.706678 28.115329 8.077672 339.223727 320.531531 329.592861 3.156665 2.559623 7.399994 27.260878 3.025517 3.761067 0.233736 5.059998 0.962986 0.016028 55.161008 0.162236 50.075088 55.257362 53.702117 2.813018e+07 5.394893e+03 1.321155e+08 8.559627e+06 5.776596e+06 8.559627e+06 0.0 0.0 8.559627e+06 8.559627e+06 5.776596e+06 0.0 0.0 5.776596e+06 5.776596e+06 15.000000 804.000000 820.000000 0.0 0.0 820.000000 820.000000 820.000000 0.005309 0.000053 0.019216 0.018518
75% 345.730011 351.540009 342.369995 346.230011 346.230011 2.192460e+07 1.380014 0.939990 48.194022 9.459607 357.263115 335.440268 347.635707 5.851986 2.968448 9.840012 45.009003 5.876812 5.552583 1.168752 14.619995 1.804830 0.045355 60.328975 0.244970 56.461953 74.625917 73.076926 6.039061e+07 5.144554e+06 1.665046e+08 1.276212e+07 8.612716e+06 1.276212e+07 0.0 0.0 1.276212e+07 1.276212e+07 8.612716e+06 0.0 0.0 8.612716e+06 8.612716e+06 22.000000 1033.000000 1043.000000 0.0 0.0 1043.000000 1043.000000 1043.000000 1.380014 0.013706 0.020118 0.020110
max 381.679993 384.329987 378.809998 382.179993 382.179993 6.565400e+07 7.297298 2.058409 183.244044 11.724620 387.069852 374.143195 379.054286 13.693526 4.269934 24.709991 80.290577 10.929499 9.458732 3.273955 38.230011 4.750806 0.149740 72.738611 0.268682 69.292585 92.449478 92.345203 1.004125e+08 1.618555e+07 2.200628e+08 1.688531e+08 1.139531e+08 1.688531e+08 0.0 0.0 1.688531e+08 1.688531e+08 1.139531e+08 0.0 0.0 1.139531e+08 1.139531e+08 228.000000 6040.000000 6221.000000 0.0 0.0 6221.000000 6221.000000 6221.000000 7.297298 0.070433 0.023392 0.023392
In [1434]:
# DROPPING ALL NaN VALUES
# (No-op by this point: NaNs were already filled/dropped in the cells above.)
# Rebinding instead of inplace=True avoids the hidden-state anti-pattern.
df = df.dropna()
In [1435]:
# Number of most-recent rows shown in the zoomed-in bottom panel.
n_zoom = 365
sns.set_context("talk", font_scale=1.3)
# plt.style.use(['seaborn'])

# VISUALIZE REALIZED CURRENT VS. FUTURE VOLATILITY
# NOTE(review): n_future and INTERVAL_WINDOW are defined in earlier cells (not
# visible here); this cell fails on a fresh kernel if those cells were skipped.
with sns.axes_style("whitegrid"):
    fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, figsize=(18,14))

    # Top panel: full history of current vs. future realized volatility.
    ax1.plot(df.vol_current, alpha=.8, lw=1, color='gray', ls=':',
            label='Current Volatility')
    ax1.plot(df.vol_future, lw=1, color='blue',
            label=f'Next {n_future} Days Volatility (TARGET)')

    # Bottom panel: same two series restricted to the last n_zoom rows.
    ax2.plot(df.vol_current[-n_zoom:], alpha=.8, lw=2, color='gray', ls=':',
            label='Current Volatility')
    ax2.plot(df.vol_future[-n_zoom:], lw=2, color='blue',
            label=f'Next {n_future} Days Volatility (TARGET)')

    ax1.title.set_text(f'Future vs. Current Daily Volatility \n Using {INTERVAL_WINDOW}-Day Interval')
    ax2.title.set_text(f'Zooming in the Last {n_zoom} Days')

    ax1.legend(loc='upper left', prop={'size': 13}, frameon=True)
    ax2.legend(loc='upper left', prop={'size': 13}, frameon=True)
    plt.tight_layout()

    plt.show();

Daily Volatility Distribution

In [1436]:
# Distribution of realized daily volatility with a fitted normal overlay.
# NOTE(review): sns.distplot is deprecated (removed in newer seaborn releases);
# migrate to sns.histplot(..., stat='density') plus a manual norm fit when
# upgrading seaborn.
with sns.axes_style("darkgrid"):
    fig, ax = plt.subplots(figsize=(10,6))
    sns.distplot(df.vol_current, norm_hist=True, fit=stats.norm,
                bins=50, ax=ax)
    plt.title('Daily Volatility Distribution')

    plt.show();

Experiment 2: weekly granularity

In [1438]:
# Stock selector for the weekly-granularity experiment.
# NOTE(review): duplicates the dropdown defined at the top of the notebook, and
# downstream results depend on interactive widget state, which is not
# reproducible from code alone under Restart & Run All.
w = widgets.Dropdown(
    options=['SELECT','AAPL', 'ABUS', 'ARDS', 'BABA','BFRI', 
             'FB', 'GME', 'MCD','PFE', 'PLUG', 
             'QCOM', 'SENS','TSLA', 'TWTR', 'UUUU'],
    value='SELECT',
    description ='Stock name:',
)

def on_change(change):
    """Echo the newly selected ticker whenever the dropdown value changes."""
    # Guard clause: ignore every event except an actual value change.
    if change['type'] != 'change' or change['name'] != 'value':
        return
    print("You have selected %s" % change['new'])

w.observe(on_change)

display(w)
You have selected FB
In [1439]:
# Load the pre-built feature CSV for the chosen ticker.
# FIX: the original 15-way if-chain of identical pd.read_csv calls is collapsed
# into one parameterized read. Behavior is unchanged: nothing is loaded while
# the 'SELECT' sentinel is active, and only known tickers are accepted.
VALID_TICKERS = {'AAPL', 'ABUS', 'ARDS', 'BABA', 'BFRI',
                 'FB', 'GME', 'MCD', 'PFE', 'PLUG',
                 'QCOM', 'SENS', 'TSLA', 'TWTR', 'UUUU'}
if w.value in VALID_TICKERS:
    df = pd.read_csv('/content/Final_%s.csv' % w.value,
                     parse_dates=['Date'], index_col=['Date'])
In [1440]:
# Inspect the loaded feature set: OHLCV columns, technical indicators,
# Big-Five tweet features (B5_*), and verified/fake-news flags.
df.columns
Out[1440]:
Index(['Unnamed: 0', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume',
       'Return', 'Beta', 'Variance', 'AvgTrueRange', 'Upperband', 'Lowerband',
       'Middleband', 'APO', 'NATR', 'TRANGE', 'DMI', 'MACD', 'MACDSIGNAL',
       'MACDHIST', 'MOM', 'PPO', 'ROCP', 'RSI', 'TRIX', 'ULTOSC', 'SLOWK',
       'SLOWD', 'AD', 'ADOSC', 'OBV', 'Upward_momentum_created',
       'Downward_momentum_created', 'B5_O_Um', 'B5_C_Um', 'B5_E_Um', 'B5_A_Um',
       'B5_N_Um', 'B5_O_Dm', 'B5_C_Dm', 'B5_E_Dm', 'B5_A_Dm', 'B5_N_Dm',
       'Verified_status_True', 'Verified_status_False', 'O', 'C', 'E', 'A',
       'N', 'Real_or_Fake_tweet'],
      dtype='object')
In [1441]:
# (rows, columns) of the daily frame.
df.shape
Out[1441]:
(295, 52)
In [1442]:
# Count missing values per column (isna is the canonical alias of isnull).
df.isna().sum()
Out[1442]:
Unnamed: 0                    0
Open                          0
High                          0
Low                           0
Close                         0
Adj Close                     0
Volume                        0
Return                        0
Beta                          0
Variance                      0
AvgTrueRange                  0
Upperband                     0
Lowerband                     0
Middleband                    0
APO                           3
NATR                          0
TRANGE                        0
DMI                           0
MACD                         11
MACDSIGNAL                   11
MACDHIST                     11
MOM                           0
PPO                           3
ROCP                          0
RSI                           0
TRIX                         66
ULTOSC                        6
SLOWK                         0
SLOWD                         0
AD                            0
ADOSC                         0
OBV                           0
Upward_momentum_created       0
Downward_momentum_created     0
B5_O_Um                       0
B5_C_Um                       0
B5_E_Um                       0
B5_A_Um                       0
B5_N_Um                       0
B5_O_Dm                       0
B5_C_Dm                       0
B5_E_Dm                       0
B5_A_Dm                       0
B5_N_Dm                       0
Verified_status_True          0
Verified_status_False         0
O                             0
C                             0
E                             0
A                             0
N                             0
Real_or_Fake_tweet            0
dtype: int64
In [1443]:
# Impute remaining gaps with per-column medians, drop the leftover CSV index
# column, and give the target flag a shorter name.
df = (
    df.fillna(df.median())
      .drop(columns=['Unnamed: 0'])
      .rename(columns={'Real_or_Fake_tweet': 'Fake_news'})
)
In [1444]:
# Down-sample daily rows to calendar weeks, averaging every column.
df_weekly = df.resample('W').mean()
In [1445]:
# (weeks, columns) after resampling.
df_weekly.shape
Out[1445]:
(61, 51)
In [1446]:
# Heat-map of pairwise correlations across all weekly features.
weekly_corr = df_weekly.corr()
plt.figure(figsize=(40, 15))
sns.heatmap(weekly_corr, annot=True)
Out[1446]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f077d2240d0>
In [1447]:
# Shrink seaborn fonts for the dense histogram grid that follows.
sns.set(font_scale=0.8)
In [1448]:
# Per-column histograms of the weekly features (trailing ';' suppresses the axes repr).
df_weekly.hist(figsize=(20, 32), bins=50, xlabelsize=8, ylabelsize=8);
In [1449]:
# Features whose weekly correlation with 'AvgTrueRange' exceeds 0.5 in magnitude.
corr_series = df_weekly.corr()['AvgTrueRange']
strong = corr_series[corr_series.abs() > 0.5]
golden_features_list = strong.sort_values(ascending=False)
print("There are {} strongly correlated values with AvgTrueRange:\n{}".format(len(golden_features_list), golden_features_list))
There are 6 strongly correlated values with AvgTrueRange:
AvgTrueRange    1.000000
NATR            0.868423
TRANGE          0.698232
Variance        0.695498
Volume          0.589164
TRIX           -0.559909
Name: AvgTrueRange, dtype: float64
In [1450]:
# Features whose weekly correlation with 'NATR' exceeds 0.5 in magnitude.
corr_series = df_weekly.corr()['NATR']
strong = corr_series[corr_series.abs() > 0.5]
golden_features_list = strong.sort_values(ascending=False)
print("There are {} strongly correlated values with NATR :\n{}".format(len(golden_features_list), golden_features_list))
There are 16 strongly correlated values with NATR :
NATR            1.000000
AvgTrueRange    0.868423
TRANGE          0.613417
Volume          0.563632
Variance        0.530985
OBV            -0.514077
TRIX           -0.526836
AD             -0.570234
Upperband      -0.570920
High           -0.637673
Middleband     -0.646430
Open           -0.649958
Adj Close      -0.655013
Close          -0.655013
Low            -0.662618
Lowerband      -0.710897
Name: NATR, dtype: float64
In [1451]:
# Features whose weekly correlation with 'TRANGE' exceeds 0.5 in magnitude.
corr_series = df_weekly.corr()['TRANGE']
strong = corr_series[corr_series.abs() > 0.5]
golden_features_list = strong.sort_values(ascending=False)
print("There are {} strongly correlated values with TRANGE:\n{}".format(len(golden_features_list), golden_features_list))
There are 19 strongly correlated values with TRANGE:
TRANGE                       1.000000
Volume                       0.758764
AvgTrueRange                 0.698232
NATR                         0.613417
Variance                     0.589599
Verified_status_False        0.566789
N                            0.566622
Fake_news                    0.566622
O                            0.566622
A                            0.566622
Verified_status_True         0.528294
B5_A_Um                      0.504942
B5_O_Um                      0.504942
Upward_momentum_created      0.504942
B5_N_Um                      0.504942
B5_O_Dm                      0.504942
B5_A_Dm                      0.504942
Downward_momentum_created    0.504942
B5_N_Dm                      0.504942
Name: TRANGE, dtype: float64
In [1452]:
# Features whose weekly correlation with Openness ('O') exceeds 0.5 in magnitude.
corr_series = df_weekly.corr()['O']
strong = corr_series[corr_series.abs() > 0.5]
golden_features_list = strong.sort_values(ascending=False)
print("There are {} strongly correlated values with Openness:\n{}".format(len(golden_features_list), golden_features_list))
There are 16 strongly correlated values with Openness:
Fake_news                    1.000000
N                            1.000000
A                            1.000000
O                            1.000000
Verified_status_False        0.999898
Verified_status_True         0.941721
Volume                       0.825803
B5_N_Um                      0.612428
B5_A_Um                      0.612428
B5_O_Um                      0.612428
Upward_momentum_created      0.612428
B5_N_Dm                      0.612428
B5_A_Dm                      0.612428
B5_O_Dm                      0.612428
Downward_momentum_created    0.612428
TRANGE                       0.566622
Name: O, dtype: float64
In [1453]:
# Features whose weekly correlation with Conscientiousness ('C') exceeds 0.5 in magnitude.
corr_series = df_weekly.corr()['C']
strong = corr_series[corr_series.abs() > 0.5]
golden_features_list = strong.sort_values(ascending=False)
print("There are {} strongly correlated values with conscientiousness:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with conscientiousness:
Series([], Name: C, dtype: float64)
In [1454]:
# Features whose weekly correlation with Extraversion ('E') exceeds 0.5 in magnitude.
# BUGFIX: the print label said "conscientiousness" (copy-paste from the 'C' cell);
# it now names the column actually analyzed.
df_corr = df_weekly.corr()['E'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Extraversion:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with conscientiousness:
Series([], Name: E, dtype: float64)
In [1455]:
# Features whose weekly correlation with Agreeableness ('A') exceeds 0.5 in magnitude.
# BUGFIX: the print label said "conscientiousness" (copy-paste from the 'C' cell);
# it now names the column actually analyzed.
df_corr = df_weekly.corr()['A'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Agreeableness:\n{}".format(len(golden_features_list), golden_features_list))
There are 16 strongly correlated values with conscientiousness:
Fake_news                    1.000000
N                            1.000000
A                            1.000000
O                            1.000000
Verified_status_False        0.999898
Verified_status_True         0.941721
Volume                       0.825803
B5_N_Um                      0.612428
B5_A_Um                      0.612428
B5_O_Um                      0.612428
Upward_momentum_created      0.612428
B5_N_Dm                      0.612428
B5_A_Dm                      0.612428
B5_O_Dm                      0.612428
Downward_momentum_created    0.612428
TRANGE                       0.566622
Name: A, dtype: float64
In [1456]:
# Features whose weekly correlation with Neuroticism ('N') exceeds 0.5 in magnitude.
# BUGFIX: the print label said "conscientiousness" (copy-paste from the 'C' cell);
# it now names the column actually analyzed.
df_corr = df_weekly.corr()['N'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Neuroticism:\n{}".format(len(golden_features_list), golden_features_list))
There are 16 strongly correlated values with conscientiousness:
Fake_news                    1.000000
N                            1.000000
A                            1.000000
O                            1.000000
Verified_status_False        0.999898
Verified_status_True         0.941721
Volume                       0.825803
B5_N_Um                      0.612428
B5_A_Um                      0.612428
B5_O_Um                      0.612428
Upward_momentum_created      0.612428
B5_N_Dm                      0.612428
B5_A_Dm                      0.612428
B5_O_Dm                      0.612428
Downward_momentum_created    0.612428
TRANGE                       0.566622
Name: N, dtype: float64
In [1457]:
# Features whose weekly correlation with 'B5_O_Um' exceeds 0.5 in magnitude.
corr_series = df_weekly.corr()['B5_O_Um']
strong = corr_series[corr_series.abs() > 0.5]
golden_features_list = strong.sort_values(ascending=False)
print("There are {} strongly correlated values with B5_O_Um:\n{}".format(len(golden_features_list), golden_features_list))
There are 16 strongly correlated values with B5_O_Um:
B5_N_Um                      1.000000
B5_A_Um                      1.000000
B5_O_Um                      1.000000
Upward_momentum_created      1.000000
B5_N_Dm                      1.000000
B5_A_Dm                      1.000000
B5_O_Dm                      1.000000
Downward_momentum_created    1.000000
Verified_status_True         0.752767
Fake_news                    0.612428
N                            0.612428
A                            0.612428
O                            0.612428
Verified_status_False        0.604906
Volume                       0.569333
TRANGE                       0.504942
Name: B5_O_Um, dtype: float64
In [1458]:
# Features whose weekly correlation with 'B5_C_Um' exceeds 0.5 in magnitude.
corr_series = df_weekly.corr()['B5_C_Um']
strong = corr_series[corr_series.abs() > 0.5]
golden_features_list = strong.sort_values(ascending=False)
print("There are {} strongly correlated values with B5_C_Um:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with B5_C_Um:
Series([], Name: B5_C_Um, dtype: float64)
In [1459]:
# Features whose weekly correlation with 'B5_E_Um' exceeds 0.5 in magnitude.
corr_series = df_weekly.corr()['B5_E_Um']
strong = corr_series[corr_series.abs() > 0.5]
golden_features_list = strong.sort_values(ascending=False)
print("There are {} strongly correlated values with B5_E_Um:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with B5_E_Um:
Series([], Name: B5_E_Um, dtype: float64)
In [1460]:
# Features whose weekly correlation with 'B5_A_Um' exceeds 0.5 in magnitude.
corr_series = df_weekly.corr()['B5_A_Um']
strong = corr_series[corr_series.abs() > 0.5]
golden_features_list = strong.sort_values(ascending=False)
print("There are {} strongly correlated values with B5_A_Um:\n{}".format(len(golden_features_list), golden_features_list))
There are 16 strongly correlated values with B5_A_Um:
B5_N_Um                      1.000000
B5_A_Um                      1.000000
B5_O_Um                      1.000000
Upward_momentum_created      1.000000
B5_N_Dm                      1.000000
B5_A_Dm                      1.000000
B5_O_Dm                      1.000000
Downward_momentum_created    1.000000
Verified_status_True         0.752767
Fake_news                    0.612428
N                            0.612428
A                            0.612428
O                            0.612428
Verified_status_False        0.604906
Volume                       0.569333
TRANGE                       0.504942
Name: B5_A_Um, dtype: float64
In [1461]:
# Features whose weekly correlation with 'B5_N_Um' exceeds 0.5 in magnitude.
corr_series = df_weekly.corr()['B5_N_Um']
strong = corr_series[corr_series.abs() > 0.5]
golden_features_list = strong.sort_values(ascending=False)
print("There are {} strongly correlated values with B5_N_Um:\n{}".format(len(golden_features_list), golden_features_list))
There are 16 strongly correlated values with B5_N_Um:
B5_N_Um                      1.000000
B5_A_Um                      1.000000
B5_O_Um                      1.000000
Upward_momentum_created      1.000000
B5_N_Dm                      1.000000
B5_A_Dm                      1.000000
B5_O_Dm                      1.000000
Downward_momentum_created    1.000000
Verified_status_True         0.752767
Fake_news                    0.612428
N                            0.612428
A                            0.612428
O                            0.612428
Verified_status_False        0.604906
Volume                       0.569333
TRANGE                       0.504942
Name: B5_N_Um, dtype: float64

Downward momentum correlation

In [1462]:
# Features whose weekly correlation with 'B5_O_Dm' exceeds 0.5 in magnitude.
corr_series = df_weekly.corr()['B5_O_Dm']
strong = corr_series[corr_series.abs() > 0.5]
golden_features_list = strong.sort_values(ascending=False)
print("There are {} strongly correlated values with B5_O_Dm:\n{}".format(len(golden_features_list), golden_features_list))
There are 16 strongly correlated values with B5_O_Dm:
B5_N_Dm                      1.000000
B5_A_Dm                      1.000000
B5_O_Dm                      1.000000
Downward_momentum_created    1.000000
B5_N_Um                      1.000000
B5_A_Um                      1.000000
B5_O_Um                      1.000000
Upward_momentum_created      1.000000
Verified_status_True         0.752767
Fake_news                    0.612428
N                            0.612428
A                            0.612428
O                            0.612428
Verified_status_False        0.604906
Volume                       0.569333
TRANGE                       0.504942
Name: B5_O_Dm, dtype: float64
In [1463]:
# Features whose weekly correlation with 'B5_C_Dm' exceeds 0.5 in magnitude.
corr_series = df_weekly.corr()['B5_C_Dm']
strong = corr_series[corr_series.abs() > 0.5]
golden_features_list = strong.sort_values(ascending=False)
print("There are {} strongly correlated values with B5_C_Dm:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with B5_C_Dm:
Series([], Name: B5_C_Dm, dtype: float64)
In [1464]:
# Features whose weekly correlation with 'B5_E_Dm' exceeds 0.5 in magnitude.
corr_series = df_weekly.corr()['B5_E_Dm']
strong = corr_series[corr_series.abs() > 0.5]
golden_features_list = strong.sort_values(ascending=False)
print("There are {} strongly correlated values with B5_E_Dm:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with B5_E_Dm:
Series([], Name: B5_E_Dm, dtype: float64)
In [1465]:
# Features whose weekly correlation with 'B5_A_Dm' exceeds 0.5 in magnitude.
corr_series = df_weekly.corr()['B5_A_Dm']
strong = corr_series[corr_series.abs() > 0.5]
golden_features_list = strong.sort_values(ascending=False)
print("There are {} strongly correlated values with B5_A_Dm:\n{}".format(len(golden_features_list), golden_features_list))
There are 16 strongly correlated values with B5_A_Dm:
B5_N_Dm                      1.000000
B5_A_Dm                      1.000000
B5_O_Dm                      1.000000
Downward_momentum_created    1.000000
B5_N_Um                      1.000000
B5_A_Um                      1.000000
B5_O_Um                      1.000000
Upward_momentum_created      1.000000
Verified_status_True         0.752767
Fake_news                    0.612428
N                            0.612428
A                            0.612428
O                            0.612428
Verified_status_False        0.604906
Volume                       0.569333
TRANGE                       0.504942
Name: B5_A_Dm, dtype: float64
In [1466]:
# Features whose weekly correlation with 'B5_N_Dm' exceeds 0.5 in magnitude.
corr_series = df_weekly.corr()['B5_N_Dm']
strong = corr_series[corr_series.abs() > 0.5]
golden_features_list = strong.sort_values(ascending=False)
print("There are {} strongly correlated values with B5_N_Dm:\n{}".format(len(golden_features_list), golden_features_list))
There are 16 strongly correlated values with B5_N_Dm:
B5_N_Dm                      1.000000
B5_A_Dm                      1.000000
B5_O_Dm                      1.000000
Downward_momentum_created    1.000000
B5_N_Um                      1.000000
B5_A_Um                      1.000000
B5_O_Um                      1.000000
Upward_momentum_created      1.000000
Verified_status_True         0.752767
Fake_news                    0.612428
N                            0.612428
A                            0.612428
O                            0.612428
Verified_status_False        0.604906
Volume                       0.569333
TRANGE                       0.504942
Name: B5_N_Dm, dtype: float64
In [1467]:
# Features whose weekly correlation with 'Fake_news' exceeds 0.5 in magnitude.
# (The print label keeps the column's pre-rename name 'Real_or_Fake_tweet'.)
corr_series = df_weekly.corr()['Fake_news']
strong = corr_series[corr_series.abs() > 0.5]
golden_features_list = strong.sort_values(ascending=False)
print("There are {} strongly correlated values with Real_or_Fake_tweet :\n{}".format(len(golden_features_list), golden_features_list))
There are 16 strongly correlated values with Real_or_Fake_tweet :
Fake_news                    1.000000
N                            1.000000
A                            1.000000
O                            1.000000
Verified_status_False        0.999898
Verified_status_True         0.941721
Volume                       0.825803
B5_N_Um                      0.612428
B5_A_Um                      0.612428
B5_O_Um                      0.612428
Upward_momentum_created      0.612428
B5_N_Dm                      0.612428
B5_A_Dm                      0.612428
B5_O_Dm                      0.612428
Downward_momentum_created    0.612428
TRANGE                       0.566622
Name: Fake_news, dtype: float64
In [1468]:
# Features whose weekly correlation with 'Downward_momentum_created' exceeds 0.5 in magnitude.
corr_series = df_weekly.corr()['Downward_momentum_created']
strong = corr_series[corr_series.abs() > 0.5]
golden_features_list = strong.sort_values(ascending=False)
print("There are {} strongly correlated values with Downward_momentum_created :\n{}".format(len(golden_features_list), golden_features_list))
There are 16 strongly correlated values with Downward_momentum_created :
B5_N_Dm                      1.000000
B5_A_Dm                      1.000000
B5_O_Dm                      1.000000
Downward_momentum_created    1.000000
B5_N_Um                      1.000000
B5_A_Um                      1.000000
B5_O_Um                      1.000000
Upward_momentum_created      1.000000
Verified_status_True         0.752767
Fake_news                    0.612428
N                            0.612428
A                            0.612428
O                            0.612428
Verified_status_False        0.604906
Volume                       0.569333
TRANGE                       0.504942
Name: Downward_momentum_created, dtype: float64
In [1469]:
# Features whose weekly correlation with 'Upward_momentum_created' exceeds 0.5 in magnitude.
corr_series = df_weekly.corr()['Upward_momentum_created']
strong = corr_series[corr_series.abs() > 0.5]
golden_features_list = strong.sort_values(ascending=False)
print("There are {} strongly correlated values with Upward_momentum_created :\n{}".format(len(golden_features_list), golden_features_list))
There are 16 strongly correlated values with Upward_momentum_created :
B5_N_Um                      1.000000
B5_A_Um                      1.000000
B5_O_Um                      1.000000
Upward_momentum_created      1.000000
B5_N_Dm                      1.000000
B5_A_Dm                      1.000000
B5_O_Dm                      1.000000
Downward_momentum_created    1.000000
Verified_status_True         0.752767
Fake_news                    0.612428
N                            0.612428
A                            0.612428
O                            0.612428
Verified_status_False        0.604906
Volume                       0.569333
TRANGE                       0.504942
Name: Upward_momentum_created, dtype: float64
In [1470]:
# Features whose weekly correlation with 'Verified_status_True' exceeds 0.5 in magnitude.
corr_series = df_weekly.corr()['Verified_status_True']
strong = corr_series[corr_series.abs() > 0.5]
golden_features_list = strong.sort_values(ascending=False)
print("There are {} strongly correlated values with Verified_status_True :\n{}".format(len(golden_features_list), golden_features_list))
There are 16 strongly correlated values with Verified_status_True :
Verified_status_True         1.000000
Fake_news                    0.941721
N                            0.941721
A                            0.941721
O                            0.941721
Verified_status_False        0.936830
Volume                       0.757491
B5_N_Um                      0.752767
B5_A_Um                      0.752767
B5_O_Um                      0.752767
Upward_momentum_created      0.752767
B5_N_Dm                      0.752767
B5_A_Dm                      0.752767
B5_O_Dm                      0.752767
Downward_momentum_created    0.752767
TRANGE                       0.528294
Name: Verified_status_True, dtype: float64
In [1471]:
# Features whose weekly correlation with 'Verified_status_False' exceeds 0.5 in magnitude.
corr_series = df_weekly.corr()['Verified_status_False']
strong = corr_series[corr_series.abs() > 0.5]
golden_features_list = strong.sort_values(ascending=False)
print("There are {} strongly correlated values with Verified_status_False :\n{}".format(len(golden_features_list), golden_features_list))
There are 16 strongly correlated values with Verified_status_False :
Verified_status_False        1.000000
Fake_news                    0.999898
N                            0.999898
A                            0.999898
O                            0.999898
Verified_status_True         0.936830
Volume                       0.826575
B5_N_Um                      0.604906
B5_A_Um                      0.604906
B5_O_Um                      0.604906
Upward_momentum_created      0.604906
B5_N_Dm                      0.604906
B5_A_Dm                      0.604906
B5_O_Dm                      0.604906
Downward_momentum_created    0.604906
TRANGE                       0.566789
Name: Verified_status_False, dtype: float64
In [1472]:
# Re-apply the smaller font scale before the pairplot grid
# (same setting as the earlier cell; kept so this section re-runs standalone).
sns.set(font_scale=0.8)
In [1473]:
# Scatter each weekly feature (five at a time) against NATR to eyeball relationships.
n_cols = len(df_weekly.columns)
for start in range(0, n_cols, 5):
    feature_chunk = df_weekly.columns[start:start + 5]
    sns.pairplot(data=df_weekly,
                x_vars=feature_chunk,
                y_vars=['NATR'])
In [1474]:
# Replace any remaining NaNs in the weekly frame with zero.
df_weekly = df_weekly.fillna(0)
In [1475]:
# NOTE(review): this is a no-op — the preceding fillna(0) already removed
# every NaN, so there is nothing left for dropna to discard.
df_weekly.dropna(inplace=True)
In [1476]:
# Thresholded correlation heat-map: only cells with corr >= 0.5 or <= -0.4
# are colored; 'Close' is excluded before computing correlations.
corr_matrix = df_weekly.drop('Close', axis=1).corr()
strong_only = corr_matrix[(corr_matrix >= 0.5) | (corr_matrix <= -0.4)]

plt.figure(figsize=(12, 10))
sns.heatmap(strong_only,
            cmap='YlGnBu', vmax=1.0, vmin=-1.0, linewidths=0.1,
            annot=True, annot_kws={"size": 8}, square=True);

Weekly volatility distribution

In [1477]:
# Weekly volatility (NATR) histogram with a fitted normal curve.
# FIX: sns.distplot was deprecated in seaborn 0.11 and removed in 0.14;
# histplot(stat='density', kde=True) plus an explicit fitted-normal overlay
# reproduces the original norm_hist + fit=stats.norm rendering.
with sns.axes_style("darkgrid"):
    fig, ax = plt.subplots(figsize=(10,6))
    sns.histplot(df_weekly.NATR, stat='density', kde=True, bins=50, ax=ax)
    # Overlay the maximum-likelihood normal fit (what fit=stats.norm used to draw).
    mu, sigma = stats.norm.fit(df_weekly.NATR.dropna())
    grid = np.linspace(df_weekly.NATR.min(), df_weekly.NATR.max(), 200)
    ax.plot(grid, stats.norm.pdf(grid, mu, sigma), color='black', lw=2,
            label=f'fitted normal (mu={mu:.4f}, sigma={sigma:.4f})')
    ax.legend()
    plt.title('Weekly Volatility Distribution')

    plt.show();